Bioinformatics A Practical Guide to Next Generation Sequencing Data Analysis (Hamid D. Ismail)

122 ◾ Bioinformatics

sh” and copy and save the following script in it, and then execute the bash file as “bash pipeline_bcftools.sh”:

#!/bin/bash

#SARS-CoV-2 variant calling

#-------------------------

#1- download fastq files from the NCBI SRA database
# Reads one run accession per line from ids.txt (in the current directory)
# and downloads each run's FASTQ file(s) into ./fastq.
mkdir -p fastq
# IFS= and -r keep each line exactly as written; the "|| [[ -n ... ]]" clause
# still processes a final line that has no trailing newline.
while IFS= read -r f || [[ -n "$f" ]]; do
  # Skip blank lines so fasterq-dump is never called with an empty accession.
  [[ -n "$f" ]] || continue
  fasterq-dump --progress --outdir fastq "$f"
done < ids.txt

#2- download and extract the reference genome
# Works inside ./ref and returns to the starting directory at the end.
mkdir -p ref
# Stop if the directory cannot be entered; otherwise the download would land
# in the wrong place and the final "cd .." would leave the project directory.
cd ref || exit 1
# The URL must be a single shell word: if it is split across lines, wget only
# receives the directory part and the remaining fragments run as commands.
wget "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/GCF_009858895.2_ASM985889v3_genomic.fna.gz"
#Extract the compressed reference FASTA file
# Name the downloaded archive explicitly instead of parsing `ls`, which
# breaks as soon as ref/ contains more than one file.
f=GCF_009858895.2_ASM985889v3_genomic.fna.gz
gzip -d "${f}"
#3- Index the reference FASTA file using samtools and bwa
# gzip -d strips the .gz suffix, so the FASTA is the same name without it.
f=${f%.gz}
samtools faidx "${f}"
bwa index "${f}"
cd ..

#4- Align the fastq reads (multiple samples) to the reference genome
# For every paired-end sample <id>_1.fastq / <id>_2.fastq in ./fastq, runs
# bwa mem against the reference and writes ../sam/<id>.sam; bwa's stderr
# goes to ../sam/<id>.log.
mkdir -p sam
cd fastq || exit 1
# Derive sample ids from the forward-read files directly instead of parsing
# `ls` output; each id is the file name with the "_1.fastq" suffix removed.
for r1 in *_1.fastq; do
  # An unmatched glob stays as the literal pattern; skip it.
  [[ -e "$r1" ]] || continue
  i=${r1%_1.fastq}
  # Paired-end alignment needs the mate file; skip samples that lack it.
  if [[ ! -e "${i}_2.fastq" ]]; then
    printf 'warning: %s_2.fastq not found, skipping %s\n' "$i" "$i" >&2
    continue
  fi
  # The \t sequences are passed through literally inside double quotes so
  # that bwa receives them as written in the read-group header line.
  bwa mem -M -t 4 \
    -R "@RG\tID:${i}\tSM:${i}" \
    ../ref/GCF_009858895.2_ASM985889v3_genomic.fna \
    "${i}_1.fastq" "${i}_2.fastq" \
    > "../sam/${i}.sam" 2> "../sam/${i}.log"
done
cd ..

#5- convert SAM files into BAM files
# Converts every ./sam/<id>.sam into ./bam/<id>.bam with samtools view.
mkdir -p bam
cd sam || exit 1
# Iterate over the glob directly instead of parsing `ls` output.
for sam_file in *.sam; do
  # An unmatched glob stays as the literal pattern; skip it.
  [[ -e "$sam_file" ]] || continue
  # Sample id is the file name without the ".sam" extension.
  i=${sam_file%.sam}
  samtools view -uS -o "../bam/${i}.bam" "${i}.sam"
done
cd ..